Exploratory Data Analytics

Code
import pandas as pd

# Load the job-postings extract; the path is relative to the working directory.
eda = pd.read_csv('data/eda_data.csv')
# Preview the first five rows to check the load and the column layout.
eda.head()
COMPANY LOCATION POSTED MIN_EDULEVELS_NAME MAX_EDULEVELS_NAME MIN_YEARS_EXPERIENCE MAX_YEARS_EXPERIENCE TITLE SKILLS SPECIALIZED_SKILLS ... COMMON_SKILLS SOFTWARE_SKILLS SOC_2021_4_NAME NAICS_2022_6 NAICS2_NAME REMOTE_TYPE_NAME SALARY TITLE_NAME SKILLS_NAME SPECIALIZED_SKILLS_NAME
0 894731.0 {\n "lat": 33.20763,\n "lon": -92.6662674\n} 6/2/2024 Bachelor's degree Master's degree 2.000000 2.000000 ET29C073C03D1F86B4 [\n "KS126DB6T061MHD7RTGQ",\n "KS126706DPFD3... [\n "KS126DB6T061MHD7RTGQ",\n "KS128006L3V0H... ... [\n "KS126706DPFD3354M7YK",\n "KS1280B68GD79... [\n "KS440W865GC4VRBW6LJP",\n "KS13USA80NE38... Data Scientists 441330.0 Retail Trade [None] 116300.0 Enterprise Analysts [\n "Merchandising",\n "Mathematics",\n "Pr... [\n "Merchandising",\n "Predictive Modeling"...
1 133098.0 {\n "lat": 44.3106241,\n "lon": -69.7794897\n} 6/2/2024 No Education Listed Master's degree 3.000000 3.000000 ET21DDA63780A7DC09 [\n "KS122626T550SLQ7QZ1C",\n "KS123YJ6KVWC9... [\n "KS122626T550SLQ7QZ1C",\n "KS123YJ6KVWC9... ... [] [\n "BGSBF3F508F7F46312E3",\n "ESEA839CED378... Data Scientists 561320.0 Administrative and Support and Waste Managemen... Remote 116300.0 Oracle Consultants [\n "Procurement",\n "Financial Statements",... [\n "Procurement",\n "Financial Statements",...
2 39063746.0 {\n "lat": 32.7766642,\n "lon": -96.7969879\n} 6/2/2024 Bachelor's degree Master's degree 5.000000 3.773903 ET3037E0C947A02404 [\n "KS1218W78FGVPVP2KXPX",\n "ESF3939CE1F80... [\n "ESF3939CE1F80C10C327",\n "KS120GV6C72JM... ... [\n "KS1218W78FGVPVP2KXPX",\n "BGS1ADAA36DB6... [\n "KS126HY6YLTB9R7XJC4Z"\n] Data Scientists 524291.0 Finance and Insurance [None] 116300.0 Data Analysts [\n "Management",\n "Exception Reporting",\n... [\n "Exception Reporting",\n "Data Analysis"...
3 37615159.0 {\n "lat": 33.4483771,\n "lon": -112.0740373\n} 6/2/2024 No Education Listed Master's degree 3.000000 3.773903 ET2114E0404BA30075 [\n "KS123QX62QYTC4JF38H8",\n "KS7G6NP6R6L1H... [\n "KS123QX62QYTC4JF38H8",\n "KS441PQ64HT13... ... [\n "KS7G6NP6R6L1H1SKFTSY",\n "KS1218W78FGVP... [\n "KS4409D76NW1S5LNCL18",\n "ESC7869CF7378... Data Scientists 522110.0 Finance and Insurance [None] 116300.0 Management Analysts [\n "Exit Strategies",\n "Reliability",\n "... [\n "Exit Strategies",\n "User Story",\n "H...
4 0.0 {\n "lat": 37.6392595,\n "lon": -120.9970014\n} 6/2/2024 No Education Listed Master's degree 5.486444 3.773903 ET0000000000000000 [] [] ... [] [] Data Scientists 999999.0 Unclassified Industry [None] 92500.0 Unclassified [] []

5 rows × 21 columns

Code
# identifying data analyst jobs by keyword searching
# NOTE: these entries are joined with '|' and used as a case-insensitive
# regular expression, so each entry is a regex fragment, not a plain string.
keywords = [
    'Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
    'Data Science', 'Data Analysis', 'Data Analytics', 'Market Research Analyst',
    # Word boundaries keep the acronym from matching inside ordinary words
    # such as "fulfillment" or "enrollment".
    r'\bLLMs?\b',
    'Language Model', 'NLP', 'Natural Language Processing',
    'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst',
    'Operations Analyst',
]

def match(col):
    """Flag rows of ``eda[col]`` whose text mentions any of the keywords.

    The keywords are joined into a single regex alternation; matching
    ignores case and treats missing values as non-matches.
    """
    return eda[col].str.contains('|'.join(keywords), case=False, na=False)


# A posting is tagged when its title or either skills column matches.
eda['DATA_ANALYST_JOB'] = (
    match('TITLE_NAME')
    | match('SKILLS_NAME')
    | match('SPECIALIZED_SKILLS_NAME')
)
eda['DATA_ANALYST_JOB'].value_counts()
DATA_ANALYST_JOB
False    37043
True     32155
Name: count, dtype: int64
Code
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# -----------------------------------------------------------------------------
# 1) Prepare your data
# -----------------------------------------------------------------------------
# Count postings for every (analytics flag, industry) pair.
df_grouped = (
    eda
    .groupby(['DATA_ANALYST_JOB', 'NAICS2_NAME'])
    .size()
    .reset_index(name='Job_Count')
)

# Compact display names for the longest industry labels; any industry that is
# not listed keeps its original name.
short_names = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services'
}
df_grouped['Industry'] = df_grouped['NAICS2_NAME'].map(short_names).fillna(df_grouped['NAICS2_NAME'])
df_grouped['Job_Type'] = df_grouped['DATA_ANALYST_JOB'].map({True: 'True', False: 'False'})

# One row per industry with a 'True' and a 'False' count column.
pivot = (
    df_grouped
    .pivot_table(
        index='Industry',
        columns='Job_Type',
        values='Job_Count',
        aggfunc='sum',      # these are counts: add them up (the default would average)
        fill_value=0,
    )
    # Guarantee both columns exist even if one class never occurs in the data.
    .reindex(columns=['True', 'False'], fill_value=0)
    # Keep whole-number counts so bar labels and table cells show no decimals.
    .astype(int)
    .reset_index()
)
industries = pivot['Industry'].tolist()
y_true = pivot['True'].tolist()
y_false = pivot['False'].tolist()

# -----------------------------------------------------------------------------
# 2) Build a 2-row subplot: bar on top, table below
# -----------------------------------------------------------------------------
subplot_specs = [
    [dict(type="bar")],    # row 1 hosts the bar traces
    [dict(type="table")],  # row 2 hosts the table trace
]
fig = make_subplots(
    rows=2,
    cols=1,
    specs=subplot_specs,
    row_heights=[0.70, 0.30],   # 70% of the height for the chart, 30% for the table
    vertical_spacing=0.12,      # gap between the chart and the table
)

colors = {'True': '#FFE5E5', 'False': '#FF6B6B'}

# The two bar series share their styling and differ only in name, data and
# fill colour, so they are added in one loop (same order: 'True', then 'False').
for series_name, series_counts in (('True', y_true), ('False', y_false)):
    bar_trace = go.Bar(
        x=industries,
        y=series_counts,
        name=series_name,
        marker=dict(color=colors[series_name], line=dict(color='#A81D1D', width=1)),
        text=series_counts,
        textposition='outside',
    )
    fig.add_trace(bar_trace, row=1, col=1)

# Table in the second row listing the same numbers as the bars.
table_header = dict(
    values=["Industry", "True", "False"],
    fill_color='#FDEDEC',
    align='left',
    font=dict(color='#A81D1D', size=13),
    height=30,
)
table_cells = dict(
    values=[industries, y_true, y_false],
    fill_color='white',
    align='left',
    font=dict(color='#333', size=11),
    height=22,
)
fig.add_trace(go.Table(header=table_header, cells=table_cells), row=2, col=1)

# -----------------------------------------------------------------------------
# 3) Slider steps: 0 → 8 000 in 200s
# -----------------------------------------------------------------------------
# Each step zeroes the bars whose count is below the threshold (instead of
# dropping them) so the x-axis categories stay in place while the slider moves.
# The figure holds three traces in this order: 'True' bars, 'False' bars, table.
# Only the two bar traces may be restyled; without explicit indices the update
# would also be applied to the table trace.
bar_trace_indices = [0, 1]

steps = []
for val in range(0, 8001, 200):
    shown_true = [v if v >= val else 0 for v in y_true]
    shown_false = [v if v >= val else 0 for v in y_false]
    # Blank the label of a filtered-out bar so no count floats above an empty slot.
    text_true = [v if v >= val else '' for v in y_true]
    text_false = [v if v >= val else '' for v in y_false]
    steps.append(dict(
        label=str(val),
        method="update",
        args=[
            {"y": [shown_true, shown_false], "text": [text_true, text_false]},
            # Change only the title text; assigning a bare string to "title"
            # would replace the whole title object and lose its font/position.
            {"title.text": f"Min Jobs ≥ {val:,}"},
            bar_trace_indices,
        ]
    ))

# -----------------------------------------------------------------------------
# 4) Final layout tweaks
# -----------------------------------------------------------------------------
accent = '#A81D1D'

# Slider sits above the plot area (y > 1 in paper coordinates).
threshold_slider = dict(
    active=0,
    currentvalue={"prefix": "Min Jobs: "},
    pad={"b": 0},
    x=0.15,
    y=1.18,
    xanchor="left",
    yanchor="bottom",
    len=0.7,
    font=dict(color=accent),
    steps=steps,
)

# Title is placed lower than the slider.
figure_title = dict(
    text="Data & Business Analytics Job Trends",
    font=dict(size=24, color=accent),
    x=0.5,
    y=0.92,
    xanchor="center",
    yanchor="top",
)

industry_axis = dict(
    title="Industry",
    title_font=dict(size=16, color=accent),
    tickmode='array',
    tickvals=list(range(len(industries))),
    ticktext=industries,
    tickangle=-30,
    tickfont=dict(size=11, color='#333'),
    showline=True,
    linecolor=accent,
)

# Leave 20% headroom above the tallest bar for the outside text labels.
tallest_bar = max(max(y_true), max(y_false))
count_axis = dict(
    title="Number of Jobs",
    title_font=dict(size=16, color=accent),
    tickfont=dict(size=11, color='#333'),
    gridcolor='rgba(200,200,200,0.3)',
    showline=True,
    linecolor=accent,
    range=[0, tallest_bar * 1.2],
)

legend_box = dict(
    title="Data Analyst Job",
    title_font=dict(color=accent),
    font=dict(size=12),
    x=1.02,
    y=0.5,
)

fig.update_layout(
    sliders=[threshold_slider],
    title=figure_title,
    width=1100,
    height=850,
    margin=dict(l=60, r=60, t=180, b=200),  # extra top & bottom margin
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=industry_axis,
    yaxis=count_axis,
    legend=legend_box,
    bargap=0.2,
)

fig.show()
Code
import plotly.express as px
import pandas as pd

# Prepare the data: work on a copy so the columns added in this cell
# do not modify the shared `eda` frame.
df = eda.copy()

# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    """Return True when a posting counts as an analytics job.

    A row qualifies if its ``DATA_ANALYST_JOB`` flag is truthy, or if its
    title contains "business analyst" (case-insensitive). The title is read
    from ``TITLE_NAME`` when that key is present, otherwise from ``TITLE``.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    title_key = 'TITLE_NAME' if 'TITLE_NAME' in row else 'TITLE'
    return 'business analyst' in str(row[title_key]).lower()

# Flag every posting, then attach the label used for colouring and the legend.
df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
category_labels = {True: 'Analytics Job', False: 'Non-Analytics Job'}
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map(category_labels)

# Create the box plot
axis_labels = {
    'REMOTE_TYPE_NAME': 'Remote Type',
    'SALARY': 'Salary ($)',
    'Job_Category': 'Job Category',
}
category_colors = {'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'}
fig = px.box(
    df,
    x='REMOTE_TYPE_NAME',
    y='SALARY',
    color='Job_Category',
    title='Salary Distribution by Remote Type for Analytics vs Non-Analytics Jobs',
    labels=axis_labels,
    color_discrete_map=category_colors,
)

# Beautify the layout with a red-white theme (no gradients)
theme_red = "#FF6B6B"
ink = "#2D3748"

# Settings shared by both axes.
axis_style = dict(
    title_font=dict(size=16),
    tickfont=dict(size=12),
    gridcolor="#E2E8F0",
    linecolor=ink,
    linewidth=2,
    showline=True,
)

fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',   # plain white background
    paper_bgcolor='#FFFFFF',  # plain white background
    font=dict(family="Inter, sans-serif", size=14, color=ink),
    # Only styling and position are set here; the title text is left as is.
    title=dict(
        font=dict(size=24, color=theme_red),
        x=0.5,
        xanchor="center",
        y=0.95,
        yanchor="top",
    ),
    xaxis=dict(title="Remote Type", **axis_style),
    yaxis=dict(title="Salary ($)", showgrid=True, zeroline=False, **axis_style),
    legend=dict(
        title="Job Category",
        font=dict(size=13),
        bgcolor="#FFFFFF",
        bordercolor=theme_red,
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor="left",
        yanchor="middle",
    ),
    hovermode="closest",
    hoverlabel=dict(
        bgcolor="#FFFFFF",
        font_size=12,
        font_family="Inter, sans-serif",
        font_color=ink,
        bordercolor=theme_red,
    ),
)

# Show the plot
fig.show()
Code
import plotly.express as px
import pandas as pd

# Prepare the data: work on a copy so the columns added in this cell
# do not modify the shared `eda` frame.
df = eda.copy()

# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    """Decide whether a posting row is an analytics job.

    Returns True if ``row['DATA_ANALYST_JOB']`` is truthy, or if the title
    (``TITLE_NAME`` when present, else ``TITLE``) contains the phrase
    "business analyst" in any letter case; otherwise returns False.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    if 'TITLE_NAME' in row:
        raw_title = row['TITLE_NAME']
    else:
        raw_title = row['TITLE']
    return 'business analyst' in str(raw_title).lower()

# Flag each posting and attach a readable category label.
category_labels = {True: 'Analytics Job', False: 'Non-Analytics Job'}
df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map(category_labels)

# Group by industry and job category
df_grouped = (
    df
    .groupby(['NAICS2_NAME', 'IS_ANALYTICS_JOB'])
    .size()
    .reset_index(name='Job_Count')
)
df_grouped['Job_Category'] = df_grouped['IS_ANALYTICS_JOB'].map(category_labels)

# Shorten industry names for better readability; unlisted industries keep
# their original name.
short_map = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services'
}
full_names = df_grouped['NAICS2_NAME']
df_grouped['Industry'] = full_names.map(short_map).fillna(full_names)

# Create the stacked bar chart
axis_labels = {
    'Industry': 'Industry',
    'Job_Count': 'Number of Jobs',
    'Job_Category': 'Job Category',
}
category_colors = {'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'}
fig = px.bar(
    df_grouped,
    x='Industry',
    y='Job_Count',
    color='Job_Category',
    barmode='stack',
    title='Top Industries Hiring Analytics Jobs',
    labels=axis_labels,
    color_discrete_map=category_colors,
)

# Beautify the layout with a red-white theme (no gradients)
theme_red = "#FF6B6B"
ink = "#2D3748"

# Settings shared by both axes.
axis_style = dict(
    title_font=dict(size=16),
    tickfont=dict(size=12),
    gridcolor="#E2E8F0",
    linecolor=ink,
    linewidth=2,
    showline=True,
)

fig.update_layout(
    width=1000,
    height=600,
    plot_bgcolor='#FFFFFF',   # plain white background
    paper_bgcolor='#FFFFFF',  # plain white background
    font=dict(family="Inter, sans-serif", size=14, color=ink),
    # Only styling and position are set here; the title text is left as is.
    title=dict(
        font=dict(size=24, color=theme_red),
        x=0.5,
        xanchor="center",
        y=0.95,
        yanchor="top",
    ),
    # Slanted tick labels for the industry names.
    xaxis=dict(title="Industry", tickangle=-45, **axis_style),
    yaxis=dict(title="Number of Jobs", showgrid=True, zeroline=False, **axis_style),
    legend=dict(
        title="Job Category",
        font=dict(size=13),
        bgcolor="#FFFFFF",
        bordercolor=theme_red,
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor="left",
        yanchor="middle",
    ),
    hovermode="closest",
    hoverlabel=dict(
        bgcolor="#FFFFFF",
        font_size=12,
        font_family="Inter, sans-serif",
        font_color=ink,
        bordercolor=theme_red,
    ),
)

# Show the plot
fig.show()
Code
import plotly.express as px
import pandas as pd

# Prepare the data: work on a copy so the columns added (and rows dropped)
# in this cell do not modify the shared `eda` frame.
df = eda.copy()

# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    """Return True for rows that should be treated as analytics jobs.

    The ``DATA_ANALYST_JOB`` flag wins when truthy. Otherwise the lower-cased
    title (taken from ``TITLE_NAME`` if the row has it, else from ``TITLE``)
    must contain "business analyst".
    """
    if row['DATA_ANALYST_JOB']:
        return True
    title_key = 'TITLE_NAME' if 'TITLE_NAME' in row else 'TITLE'
    lowered_title = str(row[title_key]).lower()
    return 'business analyst' in lowered_title

# Flag each posting and attach a readable category label.
category_labels = {True: 'Analytics Job', False: 'Non-Analytics Job'}
df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map(category_labels)

# Calculate average years of experience (midpoint of the min and max columns)
df['Avg_Years_Experience'] = (
    df['MIN_YEARS_EXPERIENCE'].add(df['MAX_YEARS_EXPERIENCE']).div(2)
)

# Clean the data (remove rows with missing salary or experience)
required_columns = ['Avg_Years_Experience', 'SALARY']
df = df.dropna(subset=required_columns)

# Create the scatter plot with a least-squares trend line per job category.
scatter_kwargs = dict(
    x='Avg_Years_Experience',
    y='SALARY',
    color='Job_Category',
    title='Experience Requirements vs Salary for Analytics Jobs',
    labels={'Avg_Years_Experience': 'Average Years of Experience', 'SALARY': 'Salary ($)', 'Job_Category': 'Job Category'},
    color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'},
)

try:
    # plotly's built-in OLS trend line imports statsmodels when it is drawn.
    fig = px.scatter(df, trendline='ols', **scatter_kwargs)
except ModuleNotFoundError as exc:
    if exc.name != 'statsmodels':
        raise
    # statsmodels is not installed: draw the points alone and add the simple
    # linear-regression line for each category, computed with pandas.
    fig = px.scatter(df, **scatter_kwargs)
    for category, group in df.groupby('Job_Category'):
        x_vals = group['Avg_Years_Experience']
        y_vals = group['SALARY']
        x_var = x_vals.var()
        # A line needs spread in x; this also skips NaN variance (single row).
        if not x_var > 0:
            continue
        slope = x_vals.cov(y_vals) / x_var
        intercept = y_vals.mean() - slope * x_vals.mean()
        x_ends = [x_vals.min(), x_vals.max()]
        fig.add_scatter(
            x=x_ends,
            y=[intercept + slope * x_end for x_end in x_ends],
            mode='lines',
            name=f'{category} trend',
            line=dict(color=scatter_kwargs['color_discrete_map'].get(category)),
            showlegend=False,
        )

# Beautify the layout with a red-white theme (no gradients)
theme_red = "#FF6B6B"
ink = "#2D3748"

# Both axes use exactly the same styling apart from their titles.
axis_style = dict(
    title_font=dict(size=16),
    tickfont=dict(size=12),
    gridcolor="#E2E8F0",
    linecolor=ink,
    linewidth=2,
    showline=True,
    showgrid=True,
    zeroline=False,
)

fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',   # plain white background
    paper_bgcolor='#FFFFFF',  # plain white background
    font=dict(family="Inter, sans-serif", size=14, color=ink),
    # Only styling and position are set here; the title text is left as is.
    title=dict(
        font=dict(size=24, color=theme_red),
        x=0.5,
        xanchor="center",
        y=0.95,
        yanchor="top",
    ),
    xaxis=dict(title="Average Years of Experience", **axis_style),
    yaxis=dict(title="Salary ($)", **axis_style),
    legend=dict(
        title="Job Category",
        font=dict(size=13),
        bgcolor="#FFFFFF",
        bordercolor=theme_red,
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor="left",
        yanchor="middle",
    ),
    hovermode="closest",
    hoverlabel=dict(
        bgcolor="#FFFFFF",
        font_size=12,
        font_family="Inter, sans-serif",
        font_color=ink,
        bordercolor=theme_red,
    ),
)

# Customize scatter points: small, slightly transparent, with a thin dark outline
point_style = dict(
    size=8,
    opacity=0.7,
    line=dict(width=1, color="#2D3748"),
)
fig.update_traces(marker=point_style)

# Show the plot
fig.show()
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[6], line 26
     23 df = df.dropna(subset=['Avg_Years_Experience', 'SALARY'])
     25 # Create the scatter plot with trend line
---> 26 fig = px.scatter(df, 
     27                  x='Avg_Years_Experience', 
     28                  y='SALARY', 
     29                  color='Job_Category',
     30                  trendline='ols',  # Add trend line (ordinary least squares)
     31                  title='Experience Requirements vs Salary for Analytics Jobs',
     32                  labels={'Avg_Years_Experience': 'Average Years of Experience', 'SALARY': 'Salary ($)', 'Job_Category': 'Job Category'},
     33                  color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'})
     35 # Beautify the layout with a red-white theme (no gradients)
     36 fig.update_layout(
     37     width=900,
     38     height=600,
   (...)
     89     )
     90 )

File ~/Documents/Semester2/Web_analytics/ad688-employability-sp25A1-group11/.venv/lib/python3.13/site-packages/plotly/express/_chart_types.py:69, in scatter(data_frame, x, y, color, symbol, size, hover_name, hover_data, custom_data, text, facet_row, facet_col, facet_col_wrap, facet_row_spacing, facet_col_spacing, error_x, error_x_minus, error_y, error_y_minus, animation_frame, animation_group, category_orders, labels, orientation, color_discrete_sequence, color_discrete_map, color_continuous_scale, range_color, color_continuous_midpoint, symbol_sequence, symbol_map, opacity, size_max, marginal_x, marginal_y, trendline, trendline_options, trendline_color_override, trendline_scope, log_x, log_y, range_x, range_y, render_mode, title, subtitle, template, width, height)
     14 def scatter(
     15     data_frame=None,
     16     x=None,
   (...)
     63     height=None,
     64 ) -> go.Figure:
     65     """
     66     In a scatter plot, each row of `data_frame` is represented by a symbol
     67     mark in 2D space.
     68     """
---> 69     return make_figure(args=locals(), constructor=go.Scatter)

File ~/Documents/Semester2/Web_analytics/ad688-employability-sp25A1-group11/.venv/lib/python3.13/site-packages/plotly/express/_core.py:2668, in make_figure(args, constructor, trace_patch, layout_patch)
   2665     elif args["ecdfnorm"] == "percent":
   2666         group = group.with_columns((nw.col(var) / group_sum) * 100.0)
-> 2668 patch, fit_results = make_trace_kwargs(
   2669     args, trace_spec, group, mapping_labels.copy(), sizeref
   2670 )
   2671 trace.update(patch)
   2672 if fit_results is not None:

File ~/Documents/Semester2/Web_analytics/ad688-employability-sp25A1-group11/.venv/lib/python3.13/site-packages/plotly/express/_core.py:430, in make_trace_kwargs(args, trace_spec, trace_data, mapping_labels, sizeref)
    427     trace_patch["x"] = trace_patch["x"].to_numpy()
    429 trendline_function = trendline_functions[attr_value]
--> 430 y_out, hover_header, fit_results = trendline_function(
    431     args["trendline_options"],
    432     sorted_trace_data.get_column(args["x"]),  # narwhals series
    433     x.to_numpy(),  # numpy array
    434     y.to_numpy(),  # numpy array
    435     args["x"],
    436     args["y"],
    437     non_missing.to_numpy(),  # numpy array
    438 )
    439 assert len(y_out) == len(
    440     trace_patch["x"]
    441 ), "missing-data-handling failure in trendline code"
    442 trace_patch["y"] = y_out

File ~/Documents/Semester2/Web_analytics/ad688-employability-sp25A1-group11/.venv/lib/python3.13/site-packages/plotly/express/trendline_functions/__init__.py:42, in ols(trendline_options, x_raw, x, y, x_label, y_label, non_missing)
     36     if k not in valid_options:
     37         raise ValueError(
     38             "OLS trendline_options keys must be one of [%s] but got '%s'"
     39             % (", ".join(valid_options), k)
     40         )
---> 42 import statsmodels.api as sm
     44 add_constant = trendline_options.get("add_constant", True)
     45 log_x = trendline_options.get("log_x", False)

ModuleNotFoundError: No module named 'statsmodels'
Code
import plotly.graph_objects as go
import pandas as pd

# Prepare the data: work on a copy so the columns added in this cell
# do not modify the shared `eda` frame.
df = eda.copy()

# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    """Tell whether a posting row belongs to the analytics group.

    True when the row's ``DATA_ANALYST_JOB`` value is truthy, or when its
    title text contains "business analyst" regardless of case. ``TITLE_NAME``
    is used as the title if the row has that key; ``TITLE`` otherwise.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    source_key = 'TITLE_NAME' if 'TITLE_NAME' in row else 'TITLE'
    return 'business analyst' in str(row[source_key]).lower()

# Flag each posting and attach a readable category label.
category_labels = {True: 'Analytics Job', False: 'Non-Analytics Job'}
df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map(category_labels)

# Keep analytics jobs only, and only those with a known industry
is_analytics = df['IS_ANALYTICS_JOB']
df_analytics = df[is_analytics].copy().dropna(subset=['NAICS2_NAME'])

# Group by job category and industry to get job counts
df_grouped = (
    df_analytics
    .groupby(['Job_Category', 'NAICS2_NAME'])
    .size()
    .reset_index(name='Job_Count')
)

# Shorten industry names for better readability; unlisted industries keep
# their original name.
short_map = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services'
}
full_names = df_grouped['NAICS2_NAME']
df_grouped['NAICS2_NAME'] = full_names.map(short_map).fillna(full_names)

# Prepare data for Sankey Diagram
# Nodes: every job category first, followed by every industry.
category_nodes = list(df_grouped['Job_Category'].unique())
industry_nodes = list(df_grouped['NAICS2_NAME'].unique())
labels = category_nodes + industry_nodes

# Position of each label in the node list (first occurrence wins).
node_position = {}
for position, node_label in enumerate(labels):
    node_position.setdefault(node_label, position)

# One link per grouped row: job category -> industry, weighted by job count.
source = [node_position[job_cat] for job_cat in df_grouped['Job_Category']]
target = [node_position[industry] for industry in df_grouped['NAICS2_NAME']]
value = df_grouped['Job_Count'].tolist()

# Create the Sankey Diagram
sankey_nodes = dict(
    pad=15,
    thickness=20,
    line=dict(color="#2D3748", width=0.5),
    label=labels,
    color="#FF6B6B",  # red nodes for the theme
)
sankey_links = dict(
    source=source,
    target=target,
    value=value,
    color="rgba(255, 107, 107, 0.5)",  # semi-transparent red links
)
fig = go.Figure(data=[go.Sankey(node=sankey_nodes, link=sankey_links)])

# Beautify the layout with a red-white theme (no gradients)
figure_title = dict(
    text='Distribution of Analytics Job Postings by Industry',
    font=dict(size=24, color="#FF6B6B"),  # red title for the theme
    x=0.5,
    xanchor="center",
    y=0.95,
    yanchor="top",
)
base_font = dict(family="Inter, sans-serif", size=14, color="#2D3748")

fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',   # plain white background
    paper_bgcolor='#FFFFFF',  # plain white background
    font=base_font,
    title=figure_title,
    margin=dict(l=20, r=20, t=80, b=20),
)

# Show the plot
fig.show()